Code
library(tidyverse)
library(ggplot2)
library(car)
library(caret)
library(rFerns)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)
library(Hmisc)
library(mlbench)
library(pROC)library(tidyverse)
library(ggplot2)
library(car)
library(caret)
library(rFerns)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)
library(Hmisc)
library(mlbench)
library(pROC)qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese"),
Religion=relevel(Religion,ref="None")) |>
mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
"$10,000 - $19,999" ~"Below",
"$20,000 - $29,999"~"Below",
"$30,000 - $39,999"~"Below",
"$40,000 - $49,999"~"Below",
"$50,000 - $59,999"~"Below",
"$60,000 - $69,999"~"Above",
"$70,000 and over"~"Above",
.default=Income)) |>
mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |>
mutate(across(`Familiarity with America`:`Familiarity with Ethnic Origin`,~factor(.x,levels=c("Very low","Low", "High", "Very high"))),
across(`Identify Ethnically`,~factor(.x,levels=c("Not at all","Not very close","Somewhat close","Very close"))),
across(`Belonging`,~factor(.x,levels=c("Not at all","Not very much","Somewhat","Very much"))),
`Primary Language` = as.factor(`Primary Language`))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
We will be using the Prediction Set to run an analysis of deviance to check model performance.
rfdata <- qol |> select(`Unmet Health Need`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Unmet.Health.Need)| Characteristic | N = 1,9741 |
|---|---|
| Unmet.Health.Need | |
| 0 | 1,770 (90%) |
| Yes | 204 (10%) |
| 1 n (%) | |
n_minority <- rfdata |> filter(Unmet.Health.Need=="Yes")|> nrow()
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = rfdata[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
VSURF(Unmet.Health.Need~.,
rfdata,
na.action="na.omit",
parallel=T,verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Unmet.Health.Need ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 10 secs
VSURF selected:
18 variables at thresholding step (in 4.5 secs)
6 variables at interpretation step (in 2.7 secs)
1 variables at prediction step (in 2.8 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "English.Speaking"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "English.Speaking" "Duration.of.Residency" "Age"
[4] "Ethnicity" "Discrimination" "English.Difficulties"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.1030142
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 English.Speaking 0.01267 0.00099 black
2 Duration.of.Residency 0.01083 0.00048 black
3 Age 0.00982 0.00042 black
4 Ethnicity 0.00891 0.00052 red
5 Discrimination 0.00765 0.00041 black
6 English.Difficulties 0.00645 0.00031 black
7 Dental.Insurance 0.00621 0.00063 black
8 Religion 0.00530 0.00039 black
9 Full.Time.Employment 0.00274 0.00021 black
10 Belonging 0.00246 0.00036 black
11 Identify.Ethnically 0.00218 0.00037 black
12 Income_median 0.00215 0.00026 black
13 Health.Insurance 0.00209 0.00035 black
14 Primary.Language 0.00192 0.00020 black
15 Familiarity.with.America 0.00168 0.00033 black
16 US.Born 0.00157 0.00014 black
17 Familiarity.with.Ethnic.Origin 0.00072 0.00026 black
18 Gender 0.00055 0.00022 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_unmethealth.png", width=6, height=4.5,units="in")all_formula <- Unmet.Health.Need~.
pred_vars <- names(rfdata[,-1])[vsurf.mod$varselect.interp]
pred_vars <- if("Ethnicity" %in% pred_vars){
pred_vars
} else {
c(pred_vars,"Ethnicity")
}
mod_form <- reformulate(pred_vars, response="Unmet.Health.Need")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Health.Need")
options(contrasts = c("contr.sum","contr.poly"))
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Unmet.Health.Need ~ English.Speaking + Duration.of.Residency +
Age + Discrimination + English.Difficulties
Model 2: Unmet.Health.Need ~ English.Speaking + Duration.of.Residency +
Age + Ethnicity + Discrimination + English.Difficulties
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1964 1207.5
2 1959 1187.4 5 20.067 0.001214 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1301.232 1283.36
car::Anova(mod1, test.statistic = "F")Analysis of Deviance Table (Type II tests)
Response: Unmet.Health.Need
Error estimate based on Pearson residuals
Sum Sq Df F value Pr(>F)
English.Speaking 35.80 3 12.1296 7.258e-08 ***
Duration.of.Residency 0.08 1 0.0767 0.781867
Age 0.76 1 0.7678 0.380994
Ethnicity 20.07 5 4.0792 0.001095 **
Discrimination 28.31 1 28.7749 9.092e-08 ***
English.Difficulties 5.76 3 1.9529 0.119068
Residuals 1927.39 1959
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.016054 0.274226 -7.352 1.96e-13 ***
English.Speaking1 1.029224 0.237976 4.325 1.53e-05 ***
English.Speaking2 0.439003 0.148990 2.947 0.00321 **
English.Speaking3 -1.058307 0.210024 -5.039 4.68e-07 ***
Duration.of.Residency -0.002485 0.009047 -0.275 0.78362
Age -0.005818 0.006725 -0.865 0.38698
Ethnicity1 -0.276237 0.177019 -1.560 0.11864
Ethnicity2 -0.317491 0.225698 -1.407 0.15951
Ethnicity3 0.244252 0.254044 0.961 0.33632
Ethnicity4 0.191657 0.171791 1.116 0.26458
Ethnicity5 -0.403333 0.367192 -1.098 0.27202
Discrimination 0.862615 0.161124 5.354 8.62e-08 ***
English.Difficulties1 -0.069735 0.163448 -0.427 0.66963
English.Difficulties2 0.101377 0.142650 0.711 0.47729
English.Difficulties3 -0.271418 0.139367 -1.948 0.05147 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1312.2 on 1973 degrees of freedom
Residual deviance: 1187.4 on 1959 degrees of freedom
AIC: 1217.4
Number of Fisher Scoring iterations: 5
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.1050 0.0153 Inf 0.0786 0.139
Asian Indian 0.1012 0.0237 Inf 0.0633 0.158
Filipino 0.1649 0.0406 Inf 0.0998 0.260
Korean 0.1578 0.0226 Inf 0.1184 0.207
Other 0.0937 0.0370 Inf 0.0421 0.196
Vietnamese 0.2133 0.0259 Inf 0.1669 0.269
Results are averaged over the levels of: English.Speaking, Discrimination, English.Difficulties
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff") contrast estimate SE df z.ratio p.value
Chinese effect -0.276 0.177 Inf -1.560 0.3190
Asian Indian effect -0.317 0.226 Inf -1.407 0.3190
Filipino effect 0.244 0.254 Inf 0.961 0.3363
Korean effect 0.192 0.172 Inf 1.116 0.3264
Other effect -0.403 0.367 Inf -1.098 0.3264
Vietnamese effect 0.561 0.161 Inf 3.484 0.0030
Results are averaged over the levels of: English.Speaking, Discrimination, English.Difficulties
Results are given on the log odds ratio (not the response) scale.
P value adjustment: fdr method for 6 tests
rfdata <- qol |> select(`Unmet Dental Needs`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)| Characteristic | N = 1,9681 |
|---|---|
| Unmet.Dental.Needs | |
| 0 | 1,744 (89%) |
| Yes | 224 (11%) |
| 1 n (%) | |
rfdata |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)| Characteristic | N = 1,9681 |
|---|---|
| Unmet.Dental.Needs | |
| 0 | 1,744 (89%) |
| Yes | 224 (11%) |
| 1 n (%) | |
n_minority <- rfdata |> filter(Unmet.Dental.Needs=="Yes")|> nrow()
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = rfdata[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
VSURF(Unmet.Dental.Needs~.,
rfdata,
na.action="na.omit",
parallel=T,verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Unmet.Dental.Needs ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 7.7 secs
VSURF selected:
17 variables at thresholding step (in 4.6 secs)
1 variables at interpretation step (in 2.6 secs)
1 variables at prediction step (in 0.5 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Dental.Insurance"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Dental.Insurance"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.1148374
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Dental.Insurance 0.01362 0.00081 black
2 Age 0.00999 0.00033 black
3 English.Speaking 0.00930 0.00070 black
4 Duration.of.Residency 0.00875 0.00048 black
5 Religion 0.00734 0.00065 black
6 Income_median 0.00542 0.00059 black
7 Ethnicity 0.00395 0.00052 red
8 Familiarity.with.America 0.00384 0.00046 black
9 Primary.Language 0.00349 0.00035 black
10 English.Difficulties 0.00328 0.00052 black
11 Full.Time.Employment 0.00185 0.00028 black
12 US.Born 0.00146 0.00018 black
13 Familiarity.with.Ethnic.Origin 0.00145 0.00032 black
14 Health.Insurance 0.00137 0.00034 black
15 Discrimination 0.00107 0.00026 black
16 Belonging 0.00077 0.00036 black
17 Identify.Ethnically 0.00063 0.00038 black
18 Gender -0.00015 0.00020 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_unmetdental.png", width=6, height=4.5,units="in")Dental Insurance is the only variable in the Prediction Set, which means Ethnicity might not be a significant predictor of unmet dental needs.
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Unmet.Dental.Needs")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Dental.Needs")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Unmet.Dental.Needs ~ Dental.Insurance
Model 2: Unmet.Dental.Needs ~ Dental.Insurance + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1966 1300.0
2 1961 1291.3 5 8.7216 0.1207
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1344.41 1315.208
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.11153 0.09115 -23.165 <2e-16 ***
Dental.Insurance1 0.69784 0.07851 8.889 <2e-16 ***
Ethnicity1 -0.12599 0.15545 -0.810 0.4177
Ethnicity2 -0.31150 0.17526 -1.777 0.0755 .
Ethnicity3 0.17410 0.21838 0.797 0.4253
Ethnicity4 0.23328 0.14812 1.575 0.1153
Ethnicity5 -0.17960 0.30374 -0.591 0.5543
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1395.0 on 1967 degrees of freedom
Residual deviance: 1291.3 on 1961 degrees of freedom
AIC: 1305.3
Number of Fisher Scoring iterations: 5
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.0964 0.0138 Inf 0.0726 0.127
Asian Indian 0.0814 0.0139 Inf 0.0580 0.113
Filipino 0.1259 0.0266 Inf 0.0823 0.188
Korean 0.1326 0.0170 Inf 0.1026 0.170
Other 0.0919 0.0297 Inf 0.0480 0.169
Vietnamese 0.1299 0.0176 Inf 0.0991 0.168
Results are averaged over the levels of: Dental.Insurance
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff") contrast estimate SE df z.ratio p.value
Chinese effect -0.126 0.155 Inf -0.810 0.5104
Asian Indian effect -0.311 0.175 Inf -1.777 0.3458
Filipino effect 0.174 0.218 Inf 0.797 0.5104
Korean effect 0.233 0.148 Inf 1.575 0.3458
Other effect -0.180 0.304 Inf -0.591 0.5543
Vietnamese effect 0.210 0.154 Inf 1.361 0.3469
Results are averaged over the levels of: Dental.Insurance
Results are given on the log odds ratio (not the response) scale.
P value adjustment: fdr method for 6 tests
The non-significant analysis of deviance implies that Ethnicity might not be a significant predictor of unmet dental needs.
rfdata <- qol |>
select(`Physical Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Physical.Check.up)| Characteristic | N = 1,9661 |
|---|---|
| Physical.Check.up | |
| 0 | 630 (32%) |
| Yes | 1,336 (68%) |
| 1 n (%) | |
rfdata |> gtsummary::tbl_summary(include=Physical.Check.up)| Characteristic | N = 1,9661 |
|---|---|
| Physical.Check.up | |
| 0 | 630 (32%) |
| Yes | 1,336 (68%) |
| 1 n (%) | |
n_minority <- rfdata |> filter(Physical.Check.up=="0")|> nrow()
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = rfdata[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
VSURF(Physical.Check.up~.,
rfdata,
na.action="na.omit",
parallel=T,verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Physical.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 11.1 secs
VSURF selected:
17 variables at thresholding step (in 5.4 secs)
5 variables at interpretation step (in 3 secs)
2 variables at prediction step (in 2.7 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Duration.of.Residency" "Health.Insurance"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Duration.of.Residency" "Age" "Health.Insurance"
[4] "Dental.Insurance" "Income_median"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.282528
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Duration.of.Residency 0.03000 0.00073 black
2 Age 0.02716 0.00131 black
3 Health.Insurance 0.02245 0.00044 black
4 Dental.Insurance 0.01348 0.00057 black
5 Income_median 0.00728 0.00052 black
6 Ethnicity 0.00654 0.00060 red
7 EnglishSpeak 0.00549 0.00068 black
8 Employment 0.00464 0.00039 black
9 EnglishDiff 0.00458 0.00054 black
10 Religion 0.00408 0.00052 black
11 Familiarity.with.America 0.00251 0.00037 black
12 Familiarity.with.Ethnic.Origin 0.00238 0.00045 black
13 Gender 0.00232 0.00056 black
14 Primary.Language 0.00109 0.00030 black
15 US.Born 0.00085 0.00021 black
16 Discrimination 0.00060 0.00031 black
17 Identify.Ethnically 0.00050 0.00033 black
18 Belonging 0.00021 0.00036 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_PC.png", width=6, height=4.5,units="in")pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Physical.Check.up")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Physical.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance +
Dental.Insurance + Income_median
Model 2: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance +
Dental.Insurance + Income_median + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1960 2152.4
2 1955 2115.6 5 36.766 6.673e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 2199.048 2197.895
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.912314 0.163698 -5.573 2.50e-08 ***
Duration.of.Residency 0.032732 0.005457 5.998 2.00e-09 ***
Age 0.019095 0.003837 4.976 6.48e-07 ***
Health.Insurance1 -0.595463 0.080852 -7.365 1.77e-13 ***
Dental.Insurance1 -0.272499 0.063466 -4.294 1.76e-05 ***
Income_median1 -0.185135 0.059851 -3.093 0.00198 **
Ethnicity1 -0.037393 0.108227 -0.346 0.72971
Ethnicity2 0.059656 0.116050 0.514 0.60722
Ethnicity3 0.397812 0.165291 2.407 0.01610 *
Ethnicity4 -0.524335 0.112804 -4.648 3.35e-06 ***
Ethnicity5 -0.297137 0.192876 -1.541 0.12342
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2466.2 on 1965 degrees of freedom
Residual deviance: 2115.6 on 1955 degrees of freedom
AIC: 2137.6
Number of Fisher Scoring iterations: 4
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.589 0.0295 Inf 0.531 0.646
Asian Indian 0.613 0.0311 Inf 0.550 0.672
Filipino 0.689 0.0417 Inf 0.602 0.765
Korean 0.469 0.0316 Inf 0.408 0.531
Other 0.525 0.0573 Inf 0.414 0.635
Vietnamese 0.690 0.0311 Inf 0.626 0.748
Results are averaged over the levels of: Health.Insurance, Dental.Insurance, Income_median
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff") contrast estimate SE df z.ratio p.value
Chinese effect -0.0374 0.108 Inf -0.346 0.7297
Asian Indian effect 0.0597 0.116 Inf 0.514 0.7287
Filipino effect 0.3978 0.165 Inf 2.407 0.0322
Korean effect -0.5243 0.113 Inf -4.648 <.0001
Other effect -0.2971 0.193 Inf -1.541 0.1851
Vietnamese effect 0.4014 0.127 Inf 3.151 0.0049
Results are averaged over the levels of: Health.Insurance, Dental.Insurance, Income_median
Results are given on the log odds ratio (not the response) scale.
P value adjustment: fdr method for 6 tests
rfdata <- qol |>
select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Dentist.Check.up)| Characteristic | N = 1,9611 |
|---|---|
| Dentist.Check.up | |
| 0 | 811 (41%) |
| Yes | 1,150 (59%) |
| 1 n (%) | |
rfdata |> gtsummary::tbl_summary(include=Dentist.Check.up)| Characteristic | N = 1,9611 |
|---|---|
| Dentist.Check.up | |
| 0 | 811 (41%) |
| Yes | 1,150 (59%) |
| 1 n (%) | |
n_minority <- rfdata |> filter(Dentist.Check.up=="0") |> nrow()
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = rfdata[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
VSURF(Dentist.Check.up~.,
rfdata,
na.action="na.omit",
parallel=T,verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Dentist.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 10.2 secs
VSURF selected:
18 variables at thresholding step (in 5.4 secs)
3 variables at interpretation step (in 3.2 secs)
2 variables at prediction step (in 1.6 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Dental.Insurance" "Duration.of.Residency"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Dental.Insurance" "Duration.of.Residency" "Religion"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.295793
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Dental.Insurance 0.05197 0.00073 black
2 Duration.of.Residency 0.04911 0.00074 black
3 Religion 0.01538 0.00083 black
4 Ethnicity 0.01337 0.00042 red
5 Age 0.01120 0.00076 black
6 Health.Insurance 0.00834 0.00048 black
7 EnglishSpeak 0.00778 0.00049 black
8 Income_median 0.00686 0.00044 black
9 Identify.Ethnically 0.00370 0.00042 black
10 EnglishDiff 0.00321 0.00066 black
11 Familiarity.with.America 0.00274 0.00046 black
12 Familiarity.with.Ethnic.Origin 0.00261 0.00045 black
13 Belonging 0.00207 0.00045 black
14 Employment 0.00200 0.00028 black
15 Gender 0.00197 0.00042 black
16 US.Born 0.00176 0.00021 black
17 Primary.Language 0.00166 0.00025 black
18 Discrimination 0.00073 0.00022 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_DC.png", width=6, height=4.5,units="in")pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Dentist.Check.up")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Dentist.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency +
Religion
Model 2: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency +
Religion + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1952 2209.7
2 1947 2201.0 5 8.7418 0.1198
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 2307.098 2277.933
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.542307 0.115283 -4.704 2.55e-06 ***
Dental.Insurance1 -0.804673 0.054352 -14.805 < 2e-16 ***
Duration.of.Residency 0.045686 0.004692 9.738 < 2e-16 ***
Religion1 0.098471 0.158743 0.620 0.535
Religion2 0.344886 0.178252 1.935 0.053 .
Religion3 0.225075 0.166045 1.356 0.175
Religion4 -0.193586 0.240057 -0.806 0.420
Religion5 -0.152920 0.312546 -0.489 0.625
Religion6 -0.423634 0.345307 -1.227 0.220
Ethnicity1 0.272980 0.132916 2.054 0.040 *
Ethnicity2 -0.225308 0.241135 -0.934 0.350
Ethnicity3 0.197420 0.177079 1.115 0.265
Ethnicity4 -0.141224 0.137876 -1.024 0.306
Ethnicity5 -0.045418 0.202181 -0.225 0.822
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2659.6 on 1960 degrees of freedom
Residual deviance: 2201.0 on 1947 degrees of freedom
AIC: 2229
Number of Fisher Scoring iterations: 4
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.612 0.0388 Inf 0.534 0.685
Asian Indian 0.489 0.0555 Inf 0.383 0.597
Filipino 0.594 0.0520 Inf 0.489 0.690
Korean 0.510 0.0431 Inf 0.426 0.594
Other 0.534 0.0590 Inf 0.419 0.646
Vietnamese 0.531 0.0415 Inf 0.450 0.611
Results are averaged over the levels of: Dental.Insurance, Religion
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T) contrast odds.ratio SE df asymp.LCL asymp.UCL null z.ratio
Chinese effect 1.314 0.175 Inf 0.925 1.87 1 2.054
Asian Indian effect 0.798 0.192 Inf 0.423 1.51 1 -0.934
Filipino effect 1.218 0.216 Inf 0.764 1.94 1 1.115
Korean effect 0.868 0.120 Inf 0.604 1.25 1 -1.024
Other effect 0.956 0.193 Inf 0.561 1.63 1 -0.225
Vietnamese effect 0.943 0.130 Inf 0.656 1.36 1 -0.425
p.value
0.2400
0.5252
0.5252
0.5252
0.8223
0.8048
Results are averaged over the levels of: Dental.Insurance, Religion
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 6 estimates
Intervals are back-transformed from the log odds ratio scale
P value adjustment: fdr method for 6 tests
Tests are performed on the log odds ratio scale
rfdata <- qol |> select(`Folkmedicine`,Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Folkmedicine)| Characteristic | N = 1,9471 |
|---|---|
| Folkmedicine | |
| 0 | 1,681 (86%) |
| Yes | 266 (14%) |
| 1 n (%) | |
rfdata |> gtsummary::tbl_summary(include=Folkmedicine)| Characteristic | N = 1,9471 |
|---|---|
| Folkmedicine | |
| 0 | 1,681 (86%) |
| Yes | 266 (14%) |
| 1 n (%) | |
n_minority <- rfdata |> filter(Folkmedicine=="Yes") |> nrow()
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = rfdata[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
VSURF(Folkmedicine~.,
rfdata,
na.action="na.omit",
parallel=T,
verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Folkmedicine ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 9 secs
VSURF selected:
18 variables at thresholding step (in 4.6 secs)
3 variables at interpretation step (in 2.8 secs)
2 variables at prediction step (in 1.5 secs)
VSURF ran in parallel on a PSOCK cluster and used 15 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Age" "Ethnicity"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Age" "Duration.of.Residency" "Ethnicity"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.1383924
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Age 0.01739 0.00062 black
2 Duration.of.Residency 0.01355 0.00065 black
3 Ethnicity 0.01320 0.00073 red
4 English.Speaking 0.00902 0.00074 black
5 Religion 0.00710 0.00070 black
6 Income_median 0.00562 0.00043 black
7 English.Difficulties 0.00480 0.00055 black
8 Familiarity.with.America 0.00270 0.00046 black
9 Primary.Language 0.00259 0.00029 black
10 Familiarity.with.Ethnic.Origin 0.00259 0.00047 black
11 Full.Time.Employment 0.00241 0.00032 black
12 Dental.Insurance 0.00117 0.00035 black
13 Discrimination 0.00112 0.00028 black
14 Health.Insurance 0.00086 0.00014 black
15 US.Born 0.00071 0.00017 black
16 Identify.Ethnically 0.00069 0.00034 black
17 Gender 0.00050 0.00031 black
18 Belonging 0.00023 0.00042 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_AltMedicine.png", width=6, height=4.5,units="in")pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp])
mod_form <- reformulate(pred_vars, response="Folkmedicine")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Folkmedicine")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Folkmedicine ~ Age + Duration.of.Residency
Model 2: Folkmedicine ~ Age + Duration.of.Residency + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1944 1507.7
2 1939 1450.9 5 56.75 5.693e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1511.497 1530.377
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.135528 0.204648 -15.322 < 2e-16 ***
Age 0.022611 0.004636 4.877 1.08e-06 ***
Duration.of.Residency 0.007417 0.005960 1.244 0.2134
Ethnicity1 0.522926 0.131708 3.970 7.18e-05 ***
Ethnicity2 -0.286686 0.173196 -1.655 0.0979 .
Ethnicity3 -0.219541 0.217195 -1.011 0.3121
Ethnicity4 0.757469 0.133299 5.682 1.33e-08 ***
Ethnicity5 -0.212500 0.287863 -0.738 0.4604
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1552.9 on 1946 degrees of freedom
Residual deviance: 1450.9 on 1939 degrees of freedom
AIC: 1466.9
Number of Fisher Scoring iterations: 5
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.1741 0.0176 Inf 0.1422 0.2113
Asian Indian 0.0857 0.0142 Inf 0.0617 0.1180
Filipino 0.0912 0.0203 Inf 0.0585 0.1394
Korean 0.2104 0.0208 Inf 0.1725 0.2541
Other 0.0917 0.0280 Inf 0.0496 0.1634
Vietnamese 0.0665 0.0125 Inf 0.0459 0.0955
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T) contrast odds.ratio SE df asymp.LCL asymp.UCL null z.ratio
Chinese effect 1.687 0.222 Inf 1.192 2.388 1 3.970
Asian Indian effect 0.751 0.130 Inf 0.475 1.186 1 -1.655
Filipino effect 0.803 0.174 Inf 0.453 1.424 1 -1.011
Korean effect 2.133 0.284 Inf 1.500 3.032 1 5.682
Other effect 0.809 0.233 Inf 0.378 1.728 1 -0.738
Vietnamese effect 0.570 0.105 Inf 0.351 0.927 1 -3.053
p.value
0.0002
0.1468
0.3745
<.0001
0.4604
0.0045
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 6 estimates
Intervals are back-transformed from the log odds ratio scale
P value adjustment: fdr method for 6 tests
Tests are performed on the log odds ratio scale